3장 - 그래프 인과모델

3.1 인과관계에 대해 생각해보기

인과모델은 변수들 사이의 인과관계를 방향성 비순환 그래프(Directed Acyclic Graph, DAG)로 표현합니다. 노드는 변수를, 화살표는 직접적인 인과 영향을 나타냅니다.

import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import graphviz as gr

# Style presets (color strings, line styles, marker codes); presumably
# consumed by plotting code elsewhere in the chapter.
color = "0.3 0.5 0.7 0.9".split()
linestyle = "- -- : -.".split()
marker = "o v d p".split()

# Limit how many rows pandas shows when a DataFrame is displayed.
pd.options.display.max_rows = 6

# Make PNG the default graphviz output format. The trailing semicolon
# presumably keeps the notebook from echoing the call's return value.
gr.set_default_format("png");
import pandas as pd

# Load the cross-sell email dataset; the path is relative to the working
# directory the notebook runs from.
cross_sell_csv = "../data/cross_sell_email.csv"
data = pd.read_csv(cross_sell_csv)
data
gender cross_sell_email age conversion
0 0 short 15 0
1 1 short 27 0
2 1 long 17 0
... ... ... ... ...
320 0 no_email 15 0
321 1 no_email 16 0
322 1 long 24 1

323 rows × 4 columns

3.1.1 인과관계 시각화

인과 그래프를 사용하면 복잡한 변수 간의 관계를 시각적으로 명확하게 파악할 수 있습니다.

import graphviz as gr

# Edge list of the cross-sell email graph. The graph is built and displayed
# twice from the same edges, in the same order.
cross_sell_edges = [
    ("U", "conversion"),
    ("U", "age"),
    ("U", "gender"),
    ("rnd", "cross_sell_email"),
    ("cross_sell_email", "conversion"),
    ("age", "conversion"),
    ("gender", "conversion"),
]

g_cross_sell = gr.Digraph()
g_cross_sell.edges(cross_sell_edges)

g_cross_sell
g_cross_sell = gr.Digraph()
g_cross_sell.edges(cross_sell_edges)

g_cross_sell

# Compact version of the cross-sell graph: age and gender are replaced by a
# single node X. rankdir=LR lays the graph out from left to right.
left_to_right = {"rankdir": "LR"}
compact_edges = [
    ("U", "conversion"),
    ("U", "X"),
    ("cross_sell_email", "conversion"),
    ("X", "conversion"),
]

g_cross_sell = gr.Digraph(graph_attr=left_to_right)
g_cross_sell.edges(compact_edges)

g_cross_sell
g_cross_sell = gr.Digraph(graph_attr=left_to_right)
g_cross_sell.edges(compact_edges)

g_cross_sell

3.1.2 컨설턴트 영입 여부 결정하기

컨설턴트 영입이 이윤에 미치는 영향을 파악하기 위해 그래프로 모델링해 봅니다.

3.2 그래프 모델 집중 훈련

3.2.1 사슬

# Chain T -> M -> Y, drawn together with a worked example of the same shape.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges([("T", "M"), ("M", "Y")])
g.node("M", "M")
g.edges(
    [
        ("causal knowledge", "solve problems"),
        ("solve problems", "job promotion"),
    ]
)

g

# The same two chains, this time with the middle node of each filled in
# light grey.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges([("T", "M"), ("M", "Y")])
g.node("M", "M")
g.node("M", color="lightgrey", style="filled")
g.edges(
    [
        ("causal knowledge", "solve problems"),
        ("solve problems", "job promotion"),
    ]
)
g.node("solve problems", color="lightgrey", style="filled")

g

3.2.2 분기

# Fork: X points to both Y and T, next to a worked example of the same shape.
g = gr.Digraph()
g.edges([("X", "Y"), ("X", "T")])
g.node("X", "X")
g.edges(
    [
        ("statistics", "causal inference"),
        ("statistics", "machine learning"),
    ]
)

g

# A second fork example with a single common cause.
g = gr.Digraph()
g.edges(
    [
        ("good programmer", "can invert a binary tree"),
        ("good programmer", "good employee"),
    ]
)

g

3.2.3 충돌부

# Collider: both Y and T point into X, next to a worked example.
g = gr.Digraph()
g.edges([("Y", "X"), ("T", "X")])
g.edges([("statistics", "job promotion"), ("flatter", "job promotion")])

g

# Collider with a descendant: the node downstream of the collider is the one
# filled in light grey.
g = gr.Digraph()
g.edges([("Y", "X1"), ("T", "X1"), ("X1", "X2")])
g.node("X2", color="lightgrey", style="filled")
g.edges(
    [
        ("statistics", "job promotion"),
        ("flatter", "job promotion"),
        ("job promotion", "high salary"),
    ]
)
g.node("high salary", color="lightgrey", style="filled")

g

3.2.4 연관관계 흐름 커닝 페이퍼

3.2.5 파이썬에서 그래프 쿼리하기

# Example DAG over the single-letter nodes A-G. Each two-character string is
# unpacked into a (tail, head) pair.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges(["CA", "CB", "DA", "BE", "FE", "AG"])

g

import networkx as nx

# Directed graph over nodes A-G, built from (tail, head) edge pairs so it can
# be queried for d-separation.
model = nx.DiGraph(
    [
        ("C", "A"),
        ("C", "B"),
        ("D", "A"),
        ("B", "E"),
        ("F", "E"),
        ("A", "G"),
    ]
)

# Each answer is the negation of the d-separation check: "dependent" here
# means "not d-separated given the conditioning set".
# The empty conditioning set is written set(): a bare {} is an empty dict,
# not a set, and newer networkx releases may reject it as a node set.
print("Are D and C dependent?")
print(not nx.d_separated(model, {"D"}, {"C"}, set()))

print("Are D and C dependent given A?")
print(not nx.d_separated(model, {"D"}, {"C"}, {"A"}))

print("Are D and C dependent given G?")
print(not nx.d_separated(model, {"D"}, {"C"}, {"G"}))
Are D and C dependent?
False
Are D and C dependent given A?
True
Are D and C dependent given G?
True
# Dependence between G and D, unconditionally and then given A.
# set() is the empty conditioning set; a bare {} would be an empty dict.
print("Are G and D dependent?")
print(not nx.d_separated(model, {"G"}, {"D"}, set()))

print("Are G and D dependent given A?")
print(not nx.d_separated(model, {"G"}, {"D"}, {"A"}))
Are G and D dependent?
True
Are G and D dependent given A?
False
# Dependence between A and B, unconditionally and then given C.
# set() is the empty conditioning set; a bare {} would be an empty dict.
print("Are A and B dependent?")
print(not nx.d_separated(model, {"A"}, {"B"}, set()))

print("Are A and B dependent given C?")
print(not nx.d_separated(model, {"A"}, {"B"}, {"C"}))
Are A and B dependent?
True
Are A and B dependent given C?
False
# Dependence between G and F, unconditionally and then given E.
# set() is the empty conditioning set; a bare {} would be an empty dict.
print("Are G and F dependent?")
print(not nx.d_separated(model, {"G"}, {"F"}, set()))

print("Are G and F dependent given E?")
print(not nx.d_separated(model, {"G"}, {"F"}, {"E"}))
Are G and F dependent?
False
Are G and F dependent given E?
True

3.3 식별 재해석

# Consultancy graph without a consultancy -> profits_next_6m edge: past
# profits point to both future profits and the consultancy decision.
consultancy_sev = gr.Digraph(graph_attr={"rankdir": "LR"})
consultancy_sev.edges(
    [
        ("profits_prev_6m", "profits_next_6m"),
        ("profits_prev_6m", "consultancy"),
    ]
)

consultancy_sev

# networkx graph of the consultancy model with the treatment-to-outcome edge
# removed, used to check what association remains.
consultancy_model_severed = nx.DiGraph(
    [
        ("profits_prev_6m", "profits_next_6m"),
        ("profits_prev_6m", "consultancy"),
        # The ("consultancy", "profits_next_6m") edge is deliberately left
        # out: the causal relationship is removed.
    ]
)

# True when consultancy and profits_next_6m are NOT d-separated with nothing
# conditioned on. set() is the empty conditioning set; a bare {} would be an
# empty dict rather than a set of nodes.
not nx.d_separated(
    consultancy_model_severed, {"consultancy"}, {"profits_next_6m"}, set()
)
True
# Full consultancy graph, including the consultancy -> profits_next_6m edge;
# the profits_prev_6m node is filled in light grey.
g_consultancy = gr.Digraph(graph_attr={"rankdir": "LR"})
g_consultancy.edges(
    [
        ("profits_prev_6m", "profits_next_6m"),
        ("profits_prev_6m", "consultancy"),
        ("consultancy", "profits_next_6m"),
    ]
)
g_consultancy.node("profits_prev_6m", color="lightgrey", style="filled")

g_consultancy

3.4 조건부 독립성 가정과 보정 공식

3.5 양수성 가정

3.6 구체적인 식별 예제

# Six-row toy dataset: past profits, a 0/1 consultancy flag, and the profits
# observed afterwards.
df = pd.DataFrame(
    {
        "profits_prev_6m": [1.0, 1.0, 1.0, 5.0, 5.0, 5.0],
        "consultancy": [0, 0, 1, 0, 1, 1],
        "profits_next_6m": [1, 1.1, 1.2, 5.5, 5.7, 5.7],
    }
)

df
profits_prev_6m consultancy profits_next_6m
0 1.0 0 1.0
1 1.0 0 1.1
2 1.0 1 1.2
3 5.0 0 5.5
4 5.0 1 5.7
5 5.0 1 5.7
# Unadjusted comparison: mean future profits of rows with consultancy == 1
# minus the mean of rows with consultancy == 0.
mean_with_consultancy = df.query("consultancy==1")["profits_next_6m"].mean()
mean_without_consultancy = df.query("consultancy==0")["profits_next_6m"].mean()
mean_with_consultancy - mean_without_consultancy
1.666666666666667
# Mean future profits per (consultancy, profits_prev_6m) cell.
group_keys = ["consultancy", "profits_prev_6m"]
avg_df = df.groupby(group_keys)["profits_next_6m"].mean()

# Difference between the consultancy == 1 and consultancy == 0 means, within
# each level of past profits.
with_consultancy = avg_df.loc[1]
without_consultancy = avg_df.loc[0]
with_consultancy - without_consultancy
profits_prev_6m
1.0    0.15
5.0    0.20
Name: profits_next_6m, dtype: float64
# U points to both T and Y, while T reaches Y only through M. Each
# two-character string is unpacked into a (tail, head) pair.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges(["UT", "UY", "TM", "MY"])

g

3.7 교란편향

# Confounding: X points to both T and Y, and T points to Y.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edge("X", "T")
g.edge("X", "Y")
g.edge("T", "Y")

# Worked example with the same structure. These are plain statements; the
# original wrapped two of them in throwaway one-element tuples.
g.edge("Manager Quality", "Training")
g.edge("Manager Quality", "Engagement")
g.edge("Training", "Engagement")

g

3.7.1 대리 교란 요인

# Abstract graph: X1 points into U, and U points to X2, T and Y; T also
# points to Y.
g = gr.Digraph()
g.edges(
    [
        ("X1", "U"),
        ("U", "X2"),
        ("U", "T"),
        ("T", "Y"),
        ("U", "Y"),
    ]
)

# Worked example: nodes downstream and upstream of Manager Quality.
g.edges(
    [
        ("Manager Quality", "Team's Attrition"),
        ("Manager Quality", "Team's Past Performance"),
        ("Manager's Tenure", "Manager Quality"),
        ("Manager's Education Level", "Manager Quality"),
    ]
)

# Manager Quality also points to both Training and Engagement.
g.edges(
    [
        ("Manager Quality", "Training"),
        ("Training", "Engagement"),
        ("Manager Quality", "Engagement"),
    ]
)

g

3.7.2 랜덤화 재해석

# Randomized treatment: rnd is the only node pointing into T; U points only
# to Y.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges([("rnd", "T"), ("T", "Y"), ("U", "Y")])

g

3.8 선택편향

3.8.1 충돌부 조건부 설정

# Abstract graph: T and Y both point into S, and S is filled in light grey.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edge("T", "S")
g.edge("T", "Y")
g.edge("Y", "S")
g.node("S", color="lightgrey", style="filled")

# Worked example: the survey Response node receives edges from both the new
# feature and customer satisfaction. These are plain statements; the original
# wrapped them in throwaway one-element tuples.
g.edge("RND", "New Feature")
g.edge("New Feature", "Customer Satisfaction")
g.edge("Customer Satisfaction", "NPS")
g.edge("Customer Satisfaction", "Response")
g.edge("New Feature", "Response")
g.node("Response", "Response", color="lightgrey", style="filled")

g

# networkx version of the NPS graph. The
# ("New Feature", "Customer Satisfaction") edge is deliberately left out.
nps_model = nx.DiGraph()
nps_model.add_edges_from(
    [
        ("RND", "New Feature"),
        ("Customer Satisfaction", "NPS"),
        ("Customer Satisfaction", "Response"),
        ("New Feature", "Response"),
    ]
)


# True when NPS and New Feature are NOT d-separated given Response.
not nx.d_separated(nps_model, {"NPS"}, {"New Feature"}, {"Response"})
True
# Simulated experiment. The order of the np.random calls determines the
# random stream, so it is kept exactly: binomial, then four normal draws.
np.random.seed(2)
n = 100000
new_feature = np.random.binomial(1, 0.5, n)
got_feature = new_feature == 1

# Satisfaction without the feature, and with it (shifted up by 0.4); each
# unit's realized value follows its assignment.
satisfaction_0 = np.random.normal(0, 0.5, n)
satisfaction_1 = satisfaction_0 + 0.4
satisfaction = np.where(got_feature, satisfaction_1, satisfaction_0)

# NPS drawn around each satisfaction level with unit standard deviation;
# again the realized value follows the assignment.
nps_0 = np.random.normal(satisfaction_0, 1)
nps_1 = np.random.normal(satisfaction_1, 1)
nps = np.where(got_feature, nps_1, nps_0)


# A unit responds when a noisy score centred on feature + satisfaction
# exceeds 1.
responded = (np.random.normal(new_feature + satisfaction, 1) > 1).astype(int)

# Full table, including both nps_0 and nps_1 for every unit.
tr_df = pd.DataFrame(
    {
        "new_feature": new_feature,
        "responded": responded,
        "nps_0": nps_0,
        "nps_1": nps_1,
        "nps": nps,
    }
)

# Observable table: nps_0 and nps_1 are blanked out entirely, and nps is kept
# only for units that responded.
tr_df_measurable = pd.DataFrame(
    {
        "new_feature": new_feature,
        "responded": responded,
        "nps_0": np.nan,
        "nps_1": np.nan,
        "nps": np.where(responded, nps, np.nan),
    }
)

tr_df.groupby("new_feature").mean()
responded nps_0 nps_1 nps
new_feature
0 0.183715 -0.005047 0.395015 -0.005047
1 0.639342 -0.005239 0.401082 0.401082
# Per-arm means of the observable table, with the nps column overwritten by
# NaN in the displayed result.
(
    tr_df_measurable
    .groupby("new_feature")
    .mean()
    .assign(nps=np.nan)
)
responded nps_0 nps_1 nps
new_feature
0 0.183715 NaN NaN NaN
1 0.639342 NaN NaN NaN
# Means of the observable table, split by response status and then by arm.
(
    tr_df_measurable
    .groupby(["responded", "new_feature"])
    .mean()
)
nps_0 nps_1 nps
responded new_feature
0 0 NaN NaN NaN
1 NaN NaN NaN
1 0 NaN NaN 0.314073
1 NaN NaN 0.536106
# Same split on the full table, which still holds nps_0 and nps_1 for every
# unit.
(
    tr_df
    .groupby(["responded", "new_feature"])
    .mean()
)
nps_0 nps_1 nps
responded new_feature
0 0 -0.076869 0.320616 -0.076869
1 -0.234852 0.161725 0.161725
1 0 0.314073 0.725585 0.314073
1 0.124287 0.536106 0.536106

3.8.2 선택편향 보정

# Abstract graph: U points to X and Y, T points to Y and S, X points to S,
# and S is filled in light grey.
g = gr.Digraph()

g.edge("U", "X")
g.edge("X", "S")
g.edge("U", "Y")
g.edge("T", "Y")
g.edge("T", "S")
g.node("S", color="lightgrey", style="filled")

# Worked example with the same structure. These are plain statements; the
# original wrapped them in throwaway one-element tuples.
g.edge("New Feature", "Customer Satisfaction")
g.edge("Unknown Stuff", "Customer Satisfaction")
g.edge("Unknown Stuff", "Time in App")
g.edge("Time in App", "Response")
g.edge("New Feature", "Response")

g.node("Response", "Response", color="lightgrey", style="filled")

g

# Larger graph with several routes into S. Solid edges are added in batches
# and dashed edges one at a time, keeping the original edge order.
g = gr.Digraph(graph_attr={"rankdir": "LR"})

g.edges([("X1", "U"), ("U", "X2"), ("X5", "S")])
g.edge("U", "Y", style="dashed")
g.edge("U", "S", style="dashed")
g.edges(
    [
        ("U", "X3"),
        ("X3", "S"),
        ("Y", "X4"),
        ("X4", "S"),
        ("T", "X5"),
        ("T", "Y"),
    ]
)
g.edge("T", "S", style="dashed")
g.node("S", color="lightgrey", style="filled")

g

# Y and T both point into X, and T also points to Y. Each two-character
# string is unpacked into a (tail, head) pair.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges(["YX", "TX", "TY"])

# The trailing semicolon is kept from the original cell.
g;

3.8.3 매개자 조건부 설정

# Mediator: T reaches Y directly and through M; M is filled in light grey.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges(["TM", "TY", "MY"])
g.node("M", color="lightgrey", style="filled")

# Worked example with the same structure; seniority is the grey node.
g.edges(
    [
        ("woman", "seniority"),
        ("woman", "salary"),
        ("seniority", "salary"),
    ]
)
g.node("seniority", color="lightgrey", style="filled")

g

# Variant where the grey node is X, a child of the mediator M.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges(["TM", "TY", "MY", "MX"])
g.node("X", color="lightgrey", style="filled")

g

3.9 요약

# U points to both T and Y, and T points to Y. Each two-character string is
# unpacked into a (tail, head) pair.
g = gr.Digraph(graph_attr={"rankdir": "LR", "ratio": "0.3"})
g.edges(["UT", "UY", "TY"])

g

# T reaches Y directly and through M, and both T and Y point into S; M and S
# are filled in light grey.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges(["TM", "MY", "TY", "TS", "YS"])

g.node("M", color="lightgrey", style="filled")
g.node("S", color="lightgrey", style="filled")

g

# In-game purchase example: the "In-Game Purchase > 0" node receives edges
# from both T and the purchase amount, and is filled in light grey.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges(
    [
        ("T", "In-Game Purchase"),
        ("T", "In-Game Purchase > 0"),
        ("In-Game Purchase", "In-Game Purchase > 0"),
    ]
)

g.node("In-Game Purchase > 0", color="lightgrey", style="filled")

g

# Loan default example: default status chained over three years, with U
# pointing into every year. Year 1 is light grey, year 2 dark grey.
g = gr.Digraph(graph_attr={"rankdir": "LR"})
g.edges(
    [
        ("loan amount", "Default at yr=1"),
        ("Default at yr=1", "Default at yr=2"),
        ("Default at yr=2", "Default at yr=3"),
        ("U", "Default at yr=1"),
        ("U", "Default at yr=2"),
        ("U", "Default at yr=3"),
    ]
)

g.node("Default at yr=1", color="lightgrey", style="filled")
g.node("Default at yr=2", color="darkgrey", style="filled")

g